{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "unable to import 'smart_open.gcs', disabling that module\n"
     ]
    }
   ],
   "source": [
    "import plotly\n",
    "import sklearn.decomposition\n",
    "import gensim\n",
    "import numpy\n",
    "import jieba\n",
    "import itertools\n",
    "from collections import Counter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "w2v = gensim.models.word2vec\n",
    "\n",
    "# Read in the corpus.\n",
    "sentences = w2v.Text8Corpus('data/text8')\n",
    "\n",
    "# Train the word2vec model; `size` is the dimensionality of the word vectors.\n",
    "model = w2v.Word2Vec(sentences=sentences, size=300)\n",
    "\n",
    "# Save the trained model.\n",
    "model.save('output/text8.w2v')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\LERRY\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:3: DeprecationWarning:\n",
      "\n",
      "Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Reload the model saved by the training cell.\n",
    "model=gensim.models.Word2Vec.load('output/text8.w2v')\n",
    "# Load the vector of every word in the vocabulary. Index through model.wv:\n",
    "# indexing the model object directly is deprecated and slated for removal in gensim 4.0.0.\n",
    "all_word_vector=model.wv[model.wv.vocab]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\LERRY\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:6: DeprecationWarning:\n",
      "\n",
      "Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).\n",
      "\n",
      "C:\\Users\\LERRY\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:7: DeprecationWarning:\n",
      "\n",
      "Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).\n",
      "\n"
     ]
    }
   ],
   "source": [
    "start_word='apple'\n",
    "topn=50\n",
    "# Fit a 3-component PCA on the vectors of the whole vocabulary.\n",
    "pca=sklearn.decomposition.PCA(n_components=3)\n",
    "pca.fit(all_word_vector)\n",
    "# Collect start_word followed by its topn most similar words. The words stay in\n",
    "# similar_word_list because the plotting cell passes that name as the text labels;\n",
    "# their vectors go into a separate list instead of overwriting the words.\n",
    "similar_word_list=[start_word]+[pair[0] for pair in model.wv.most_similar(start_word,topn=topn)]\n",
    "similar_word_vectors=[model.wv[word] for word in similar_word_list]\n",
    "# Reduce the selected vectors to 3 dimensions with the fitted PCA.\n",
    "decomposed_vector=pca.transform(similar_word_vectors)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'output/word2vec.html'"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Color each point by its distance from row 0 (the start word) in the reduced space.\n",
    "origin=decomposed_vector[0]\n",
    "marker_colors=[\n",
    "    256-int(numpy.linalg.norm(decomposed_vector[idx]-origin))\n",
    "    for idx in range(len(similar_word_list))\n",
    "]\n",
    "\n",
    "# Point coordinates, text labels, label placement and colors for the 3D scatter plot.\n",
    "trace=plotly.graph_objs.Scatter3d(\n",
    "    x=decomposed_vector[:,0],\n",
    "    y=decomposed_vector[:,1],\n",
    "    z=decomposed_vector[:,2],\n",
    "    mode='markers+text',\n",
    "    text=similar_word_list,\n",
    "    textposition='bottom center',\n",
    "    marker={'color': marker_colors},\n",
    ")\n",
    "\n",
    "plot_title=f\"top{topn}Word Most Similar With '{start_word}'\"\n",
    "layout=plotly.graph_objs.Layout(title=plot_title)\n",
    "\n",
    "data=[trace]\n",
    "figure=plotly.graph_objs.Figure(data=data,layout=layout)\n",
    "\n",
    "# Write the figure to an HTML file without opening a browser.\n",
    "graph_name='output/word2vec.html'\n",
    "plotly.offline.plot(figure,filename=graph_name,auto_open=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\LERRY\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:1: DeprecationWarning:\n",
      "\n",
      "Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[('has', 0.7455781102180481),\n",
       " ('having', 0.5528787970542908),\n",
       " ('had', 0.5184059739112854),\n",
       " ('is', 0.29727986454963684),\n",
       " ('earths', 0.28828588128089905),\n",
       " ('possesses', 0.2872519791126251),\n",
       " ('retains', 0.28252407908439636),\n",
       " ('carmakers', 0.27037861943244934),\n",
       " ('variation', 0.2702816128730774),\n",
       " ('was', 0.26834139227867126)]"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Analogy query: 'does' + 'have' - 'do'.\n",
    "# Called on model.wv because Word2Vec.most_similar is deprecated (removal announced for gensim 4.0.0).\n",
    "model.wv.most_similar(positive=['does','have'],negative=['do'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\LERRY\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:1: DeprecationWarning:\n",
      "\n",
      "Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[('queen', 0.6340309381484985),\n",
       " ('princess', 0.5549601316452026),\n",
       " ('daughter', 0.5512590408325195),\n",
       " ('throne', 0.5421915054321289),\n",
       " ('empress', 0.5357568860054016),\n",
       " ('prince', 0.533805251121521),\n",
       " ('consort', 0.5334736108779907),\n",
       " ('husband', 0.53216153383255),\n",
       " ('regent', 0.5257271528244019),\n",
       " ('son', 0.5249631404876709)]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Analogy query: 'woman' + 'king' - 'man'.\n",
    "# Called on model.wv because Word2Vec.most_similar is deprecated (removal announced for gensim 4.0.0).\n",
    "model.wv.most_similar(positive=['woman', 'king'], negative=['man'])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
